Combined analysis of Fucci fluorescence and cDNA concentration

Summary

After visual inspection, chambers with no cells or with defect in the imaging were flagged for removal. The DNA yield for the same chambers was also very low. Conversely, in the absence of cells, the proportion of spikes was highest. This confirms the accuracy of the visual inspection and confirms that the conversion between C1 chip coordinates and 96-well plate coordinates was done correctly.

In the output file combined.csv, a column called Discard indicates if the cell fails any of the quality controls.

Datasets

Quality control

library(gdata)   # for drop.levels()
library(ggplot2) # for the plots
library(scales)  # for trans_new()

Load and merge the fluorescence and concentration values

The qc table is assembled by merging multiple data sources. It is then saved as qc.full. In the steps that follow, entries that do not pass quality controls will be removed from the qc table.

fl <- read.csv("../fluorescence/Results_fluorescence.csv")
fl$Error <- factor(fl$Error)
fl <- fl[,c(1,5,12,17,24,28,30,31,32)]

correctedFl <- read.csv('../Intensity_correction/correctedIntensities.csv')
correctedFl <- correctedFl[,1:3]

qc <- merge(fl, correctedFl[,1:3], all=TRUE)

# pg as short name for picogreen
pg <- read.csv('../cDNA_concentration/cDNA_concentration.csv')
pg$Column <- factor(pg$Column)
pg$cell_id <- paste(pg$Run, pg$Well, sep='_')
qc <- merge(pg, qc, by=c('cell_id', 'Run', 'Well'), all=TRUE)

spikes <- read.csv('../control-sequences/spikes.norm.csv')
qc <- merge(qc, spikes, all=TRUE)

controls <- read.csv('../combine_all/controls.csv')
summary(controls)
##            Run         Well   Control
##  1772-062-248:2   A02    :2   NC:5   
##  1772-062-249:2   A10    :1   PC:5   
##  1772-064-103:2   B01    :1          
##  1772-067-038:2   C09    :1          
##  1772-067-039:2   F08    :1          
##                   F12    :1          
##                   (Other):3
controls$cell_id <- paste(controls$Run, controls$Well, sep='_')
qc <- merge(qc, controls, by=c('cell_id', 'Run', 'Well'), all=TRUE)
rownames(qc) <- qc$cell_id

hiseq <- read.csv('../HiSeq/HiSeq.csv')
hiseq <- hiseq[,c(1,8:17,19,20)]
qc <- merge(qc, hiseq, by=c('cell_id', 'Run', 'Well', 'Row', 'Column'))

# replace error type with numbers
error <- sapply(strsplit(as.character(qc.full$Error),"-", fixed = TRUE),"[[", 1)
## Error in strsplit(as.character(qc.full$Error), "-", fixed = TRUE): object 'qc.full' not found
qc$Error <- error
## Error in eval(expr, envir, enclos): object 'error' not found
qc.full <- qc

Remove the samples that were replaced by positive or negative controls.

qc <- subset(qc, is.na(qc$Control))

Visual curation

Visual curation of the fluorescence pictures (Error field, see Fluorescence-measured-in-ImageJ.html) eliminated the chambers where it was not sure wether a healthy single cell was captured, in good concordance with the DNA yields. In the absence of a cell the libraries are mostly made of spikes.

Fluorescence.

Remove the cells for which there are no image files.

qc <- subset(qc, !is.na(qc$Error))
qplot(data = qc, Error, mean_ch2 + mean_ch3, geom = "boxplot"
) + facet_wrap(~Run, scales = "free") + ggtitle('Uncorrected fluorescence by error type') + scale_x_discrete('Error type: 0 = cell present; 1 = cell absent; 2 = debris; 3 = wrong focus; 4 = more than 1 cell')

plot of chunk qc_fluo_by_errortype Back to top

DNA concentration.

qplot(data = qc, Error, Concentration, geom = "boxplot"
) + facet_wrap(~Run, scales = "free") + ggtitle('DNA concentration by error type') + scale_x_discrete('Error type: 0 = cell present; 1 = cell absent; 2 = debris; 3 = wrong focus; 4 = more than 1 cell') + scale_y_continuous('DNA yield (ng/nL)')

plot of chunk qc_concentration_by_errortype Back to top

18S rRNA.

qplot(data = qc, Error, rRNA_18S, geom = "boxplot"
) + facet_wrap(~Run, scales = "free") + ggtitle('18S rRNA by error type') + scale_x_discrete('Error type: 0 = cell present; 1 = cell absent; 2 = debris; 3 = wrong focus; 4 = more than 1 cell') + scale_y_continuous('rRNA 18S (CPM)')
## Warning: Removed 5 rows containing non-finite values (stat_boxplot).

plot of chunk qc_rRNA_by_errortype

Back to top

Spike 1.

qplot(data = qc, Error, SPIKE_1, geom = "boxplot") + facet_wrap(~Run, scales = "free") + ggtitle('Spike 1 by error type') + scale_x_discrete('Error type: 0 = cell present; 1 = cell absent; 2 = debris; 3 = wrong focus; 4 = more than 1 cell') + scale_y_continuous('Spike 1 (CPM)')
## Warning: Removed 5 rows containing non-finite values (stat_boxplot).

plot of chunk qc_spike1_by_errortype

Nextera primers.

qplot(data = qc, Error, Nextera, geom = "boxplot") + facet_wrap(~Run, scales = "free") + ggtitle('Nextera primers by error type') + scale_x_discrete('Error type: 0 = cell present; 1 = cell absent; 2 = debris; 3 = wrong focus; 4 = more than 1 cell') + scale_y_continuous('Nextera primers')
## Warning: Removed 5 rows containing non-finite values (stat_boxplot).

plot of chunk qc_nextera_by_errortype

Median DNA yield

The DNA yield of chambers with no cells varies from run to run. Note that all chambers contain spikes, so the yield will not be null.

yield <- with(subset(qc, as.character(Error) < 2), tapply(Concentration, list(Run, drop.levels(Error)), median))
colnames(yield) <- c('one cell', 'no cell')
yield
##              one cell no cell
## 1772-062-248   3.5050  0.1080
## 1772-062-249   2.2910  0.2260
## 1772-064-103   0.3090  0.1020
## 1772-067-038   0.9490  0.0925
## 1772-067-039   0.9515  0.0520

Removing chambers which failed fluorescence QC

qc <- subset(qc, fluo_QC != TRUE,)
summary(qc)
##    cell_id                    Run          Well          Row         Column    Concentration  
##  Length:289         1772-062-248:36   C07    :  5   D      :50   1      : 26   Min.   :0.108  
##  Class :character   1772-062-249:42   D02    :  5   E      :39   2      : 26   1st Qu.:0.488  
##  Mode  :character   1772-064-103:57   D03    :  5   C      :38   7      : 26   Median :1.016  
##                     1772-067-038:82   D05    :  5   A      :36   9      : 26   Mean   :1.362  
##                     1772-067-039:72   D09    :  5   F      :36   4      : 24   3rd Qu.:2.030  
##                                       D12    :  5   B      :35   5      : 24   Max.   :4.781  
##                                       (Other):259   (Other):55   (Other):137                  
##     mean_ch2       bg_mean_ch2       mean_ch3       bg_mean_ch3                 Error    
##  Min.   : 16.70   Min.   :14.94   Min.   : 12.07   Min.   : 9.701   0-No Error     :289  
##  1st Qu.: 26.60   1st Qu.:17.35   1st Qu.: 26.34   1st Qu.:12.686   1-No cell      :  0  
##  Median : 34.97   Median :22.12   Median : 48.99   Median :13.234   2-Debris       :  0  
##  Mean   : 46.85   Mean   :21.30   Mean   : 53.07   Mean   :13.678   3-OutOfFocus   :  0  
##  3rd Qu.: 65.94   3rd Qu.:23.10   3rd Qu.: 71.63   3rd Qu.:14.358   4-MultipleCells:  0  
##  Max.   :135.99   Max.   :27.25   Max.   :177.03   Max.   :22.664                        
##                                                                                          
##   fluo_QC        ch2_corrected    ch3_corrected    low.complexity        SPIKE_1        
##  Mode :logical   Min.   : 10.70   Min.   : 11.21   Min.   :   514.9   Min.   :   28.66  
##  FALSE:289       1st Qu.: 14.81   1st Qu.: 23.41   1st Qu.:  2348.4   1st Qu.:  158.84  
##  NA's :0         Median : 22.25   Median : 51.63   Median :  4059.9   Median : 1173.60  
##                  Mean   : 38.83   Mean   : 53.76   Mean   :  5287.3   Mean   : 4044.16  
##                  3rd Qu.: 57.97   3rd Qu.: 78.29   3rd Qu.:  5828.2   3rd Qu.: 6957.50  
##                  Max.   :128.22   Max.   :145.92   Max.   :253685.6   Max.   :29382.25  
##                                                    NA's   :4          NA's   :4         
##     SPIKE_4             SPIKE_7          SPIKE_3           SPIKE_6            rRNA_18S      
##  Min.   :   0.0000   Min.   :  0.00   Min.   : 0.0000   Min.   :0.000000   Min.   :   0.00  
##  1st Qu.:   0.3359   1st Qu.:  0.00   1st Qu.: 0.0000   1st Qu.:0.000000   1st Qu.:  67.73  
##  Median :  24.0277   Median :  0.00   Median : 0.0000   Median :0.000000   Median : 314.87  
##  Mean   : 273.6173   Mean   : 25.09   Mean   : 0.8069   Mean   :0.008203   Mean   : 772.45  
##  3rd Qu.: 495.8608   3rd Qu.: 34.26   3rd Qu.: 0.0000   3rd Qu.:0.000000   3rd Qu.:1209.21  
##  Max.   :1665.7146   Max.   :273.32   Max.   :75.3734   Max.   :1.283028   Max.   :5107.16  
##  NA's   :4           NA's   :4        NA's   :4         NA's   :4          NA's   :4        
##     rRNA_28S         rRNA_5.8S          Nextera            HPV             HPV_as       Control   
##  Min.   :   0.00   Min.   : 0.0000   Min.   : 50318   Min.   :   0.0   Min.   :   0.0   NC  :  0  
##  1st Qu.:  18.14   1st Qu.: 0.0000   1st Qu.: 82841   1st Qu.: 893.9   1st Qu.: 328.5   PC  :  0  
##  Median :  50.36   Median : 0.6358   Median : 95233   Median :1460.1   Median : 512.9   NA's:289  
##  Mean   :  59.10   Mean   : 1.5019   Mean   :103690   Mean   :1641.3   Mean   : 578.5             
##  3rd Qu.:  82.77   3rd Qu.: 1.8737   3rd Qu.:122796   3rd Qu.:2146.5   3rd Qu.: 760.2             
##  Max.   :1119.20   Max.   :20.0407   Max.   :188657   Max.   :5904.7   Max.   :2166.2             
##  NA's   :4         NA's   :4         NA's   :4        NA's   :4        NA's   :4                  
##      Yield          Percent.PF      Reads         raw.clusters.per.lane Perfect.Index.Reads
##  Min.   :   0.0   Min.   :100   Min.   :      0   Min.   :0.000         Min.   :100        
##  1st Qu.: 395.0   1st Qu.:100   1st Qu.:2613214   1st Qu.:0.990         1st Qu.:100        
##  Median : 495.0   Median :100   Median :3275860   Median :1.110         Median :100        
##  Mean   : 483.8   Mean   :100   Mean   :3204306   Mean   :1.125         Mean   :100        
##  3rd Qu.: 569.0   3rd Qu.:100   3rd Qu.:3766192   3rd Qu.:1.290         3rd Qu.:100        
##  Max.   :1122.0   Max.   :100   Max.   :7430660   Max.   :2.860         Max.   :100        
##                   NA's   :2                                             NA's   :2          
##    Q30.bases     Mean.Quality.Score  HiSeq_QC      
##  Min.   :43.38   Min.   :17.15      Mode :logical  
##  1st Qu.:74.27   1st Qu.:30.23      FALSE:10       
##  Median :77.84   Median :31.22      TRUE :279      
##  Mean   :79.76   Mean   :31.67      NA's :0        
##  3rd Qu.:86.01   3rd Qu.:33.34                     
##  Max.   :92.20   Max.   :35.28                     
##  NA's   :2       NA's   :2

Removing chambers which were affected by pipetting errors.

The libraries with pipetting errors failed the HiSeq QC.

qc <- subset(qc, HiSeq_QC == TRUE)

Removing low-complexity libraries.

Some libraries in run 1772-067-039 are outliers with low yield, no fluorescence and higher proportion of low-complexity reads, therefore we remove them.

summary(subset(qc, low.complexity > 9000 & Run == '1772-067-039'))
##    cell_id                    Run         Well        Row        Column  Concentration   
##  Length:4           1772-062-248:0   B10    :1   B      :1   10     :3   Min.   :0.1580  
##  Class :character   1772-062-249:0   C02    :1   C      :1   2      :1   1st Qu.:0.1857  
##  Mode  :character   1772-064-103:0   G10    :1   G      :1   1      :0   Median :0.2220  
##                     1772-067-038:0   H10    :1   H      :1   3      :0   Mean   :0.2195  
##                     1772-067-039:4   A01    :0   A      :0   4      :0   3rd Qu.:0.2557  
##                                      A02    :0   D      :0   5      :0   Max.   :0.2760  
##                                      (Other):0   (Other):0   (Other):0                   
##     mean_ch2      bg_mean_ch2       mean_ch3      bg_mean_ch3                Error  
##  Min.   :22.65   Min.   :21.25   Min.   :13.23   Min.   :13.15   0-No Error     :4  
##  1st Qu.:22.96   1st Qu.:21.61   1st Qu.:13.27   1st Qu.:13.28   1-No cell      :0  
##  Median :23.32   Median :21.88   Median :13.50   Median :13.37   2-Debris       :0  
##  Mean   :23.28   Mean   :21.82   Mean   :13.53   Mean   :13.35   3-OutOfFocus   :0  
##  3rd Qu.:23.64   3rd Qu.:22.09   3rd Qu.:13.75   3rd Qu.:13.45   4-MultipleCells:0  
##  Max.   :23.82   Max.   :22.26   Max.   :13.89   Max.   :13.52                      
##                                                                                     
##   fluo_QC        ch2_corrected   ch3_corrected   low.complexity     SPIKE_1        SPIKE_4       
##  Mode :logical   Min.   :11.56   Min.   :12.83   Min.   : 9987   Min.   :1174   Min.   : 0.8575  
##  FALSE:4         1st Qu.:12.40   1st Qu.:12.86   1st Qu.:10803   1st Qu.:1238   1st Qu.: 1.7202  
##  NA's :0         Median :12.80   Median :13.21   Median :11279   Median :1565   Median :35.4370  
##                  Mean   :12.59   Mean   :13.21   Mean   :11127   Mean   :1686   Mean   :39.5316  
##                  3rd Qu.:13.00   3rd Qu.:13.56   3rd Qu.:11603   3rd Qu.:2012   3rd Qu.:73.2484  
##                  Max.   :13.19   Max.   :13.57   Max.   :11964   Max.   :2439   Max.   :86.3948  
##                                                                                                  
##     SPIKE_7           SPIKE_3     SPIKE_6     rRNA_18S        rRNA_28S        rRNA_5.8S     
##  Min.   : 0.0000   Min.   :0   Min.   :0   Min.   :245.7   Min.   : 65.87   Min.   :0.8575  
##  1st Qu.: 0.6432   1st Qu.:0   1st Qu.:0   1st Qu.:391.8   1st Qu.: 79.71   1st Qu.:0.9124  
##  Median : 6.0125   Median :0   Median :0   Median :444.1   Median : 84.97   Median :1.7484  
##  Mean   : 8.5277   Mean   :0   Mean   :0   Mean   :401.4   Mean   : 84.46   Mean   :3.0964  
##  3rd Qu.:13.8971   3rd Qu.:0   3rd Qu.:0   3rd Qu.:453.7   3rd Qu.: 89.72   3rd Qu.:3.9324  
##  Max.   :22.0858   Max.   :0   Max.   :0   Max.   :471.6   Max.   :102.05   Max.   :8.0312  
##                                                                                             
##     Nextera            HPV             HPV_as       Control      Yield         Percent.PF 
##  Min.   :122796   Min.   : 7.445   Min.   : 6.514   NC  :0   Min.   :150.0   Min.   :100  
##  1st Qu.:127052   1st Qu.:25.202   1st Qu.:18.946   PC  :0   1st Qu.:159.0   1st Qu.:100  
##  Median :128875   Median :44.288   Median :35.127   NA's:4   Median :169.0   Median :100  
##  Mean   :128515   Mean   :44.107   Mean   :42.074            Mean   :166.2   Mean   :100  
##  3rd Qu.:130339   3rd Qu.:63.193   3rd Qu.:58.255            3rd Qu.:176.2   3rd Qu.:100  
##  Max.   :133514   Max.   :80.407   Max.   :91.527            Max.   :177.0   Max.   :100  
##                                                                                           
##      Reads         raw.clusters.per.lane Perfect.Index.Reads   Q30.bases     Mean.Quality.Score
##  Min.   : 996116   Min.   :0.4200        Min.   :100         Min.   :68.27   Min.   :28.25     
##  1st Qu.:1054938   1st Qu.:0.4425        1st Qu.:100         1st Qu.:68.83   1st Qu.:28.52     
##  Median :1120336   Median :0.4700        Median :100         Median :69.31   Median :28.66     
##  Mean   :1101460   Mean   :0.4625        Mean   :100         Mean   :69.14   Mean   :28.59     
##  3rd Qu.:1166858   3rd Qu.:0.4900        3rd Qu.:100         3rd Qu.:69.61   3rd Qu.:28.73     
##  Max.   :1169052   Max.   :0.4900        Max.   :100         Max.   :69.66   Max.   :28.77     
##                                                                                                
##  HiSeq_QC      
##  Mode:logical  
##  TRUE:4        
##  NA's:0        
##                
##                
##                
## 
qc <- subset(qc, ! (low.complexity > 9000 & Run == '1772-067-039'))

Two libraries 1772-067-038 have quantities of spikes that are way higher than the average, suggesting bad quality. It is not sure whether they should be removed from the final analysis, but removing them now helps the readability of the plots below.

#  Disabled: removing them causes one of the plots to crash due to problems on guessing the scale.
#subset(qc, Run == '1772-067-038' & SPIKE_1 > 1)
#summary(subset(qc, Run == '1772-067-038' & SPIKE_1 < 1, SPIKE_1))
#qc <- subset(qc, ! (Run == '1772-067-038' & SPIKE_1 > 1))

There is an inverse correlation between the quantity of spike reads and the DNA yield.

If the spikes provide S molcules and the cells provide C molecules, then the spike ration should be S / (S − C), which also equals to 1 − C / (S + C). In addition, the DNA yield should be proportional to the total number of molecules, S + C. Therefore, the spike ratio, aproximated by the SPIKE_1 value, has an inverse relationship with the DNA yield, measured by the Concentration value.

ggplot(
  data=qc,
  aes(Concentration, SPIKE_1)) + 
  geom_point() + facet_wrap('Run', scale='free') + stat_quantile(formula='y ~ x')

plot of chunk unnamed-chunk-10

#ggplot(
#  data=qc,
#  aes(Concentration, SPIKE_1)) + 
#  geom_point() + facet_wrap('Run', scale='free') + stat_quantile(formula='y ~ x') + 
#  scale_x_continuous(trans = trans_new('inverse', transform = function(x) 1 / x, inverse = function(x) 1 / x), name='concentration (inverse scale)')

ggplot(
  data=qc,
  aes(
    ave(qc$Concentration, qc$Run, FUN = function(X) {rank(X) / length(X)}),
    ave(qc$SPIKE_1, qc$Run, FUN = function(X) {rank(X) / length(X)}))) + 
  geom_point()  + facet_wrap('Run') + stat_quantile(formula='y ~ x') +
  scale_x_continuous(name="rank/length concentration") +
  scale_y_continuous(name="rank/length SPIKE_1")

plot of chunk unnamed-chunk-10

Relation between fluorescence intensity and DNA concentration.

Not much for the moment…

with(qc,
  plot(
    data.frame(
      logCh2Corrected = log(ch2_corrected + 1),
      logCh3Corrected = log(ch3_corrected + 1),
      Concentration, low.complexity, SPIKE_1, HPV), col=Run))

plot of chunk unnamed-chunk-11

ggplot(data=qc, aes(Concentration, log(ch3_corrected))) + geom_point() + facet_wrap('Run', scale='free')

plot of chunk unnamed-chunk-11

Add a global QC flag and export the full table

qc.full$Discard <- FALSE
qc.full[ !is.na(qc.full$Control), 'Discard'] <- TRUE
qc.full[ is.na(qc.full$fluo_QC), 'Discard'] <- TRUE
qc.full[ which(qc.full$fluo_QC == TRUE), 'Discard'] <- TRUE
qc.full[ !qc.full$HiSeq_QC, 'Discard'] <- TRUE

write.csv(qc.full, file='../combine_all/combined.csv', row.names=FALSE)